Oral cancer, a major subtype of head and neck cancers, remains one of the most aggressive malignancies globally. Oral squamous cell carcinoma (OSCC) often arises from premalignant lesions such as leukoplakia and progresses through intermediate pathological stages, including dysplasia and hyperkeratosis with no recurrence (HkNR), before evolving into invasive cancer. Despite therapeutic advancements, early detection remains a challenge, contributing to a persistently low 5-year survival rate.
In this study, we analyze transcriptomic data from the GEO dataset GSE227919, which includes samples from healthy controls, dysplasia, HkNR, and cancer tissues. By performing differential expression analysis across these sequential stages—Dysplasia vs Control, HkNR vs Dysplasia, and Cancer vs HkNR—we aim to capture key transcriptional changes along the disease trajectory. Enrichment analyses, including g:Profiler and GSEA (Gene Set Enrichment Analysis), are employed to identify dysregulated pathways. This approach enables us to explore the molecular landscape of oral cancer progression and potentially highlight targets for early intervention and therapeutic development.
library(tidyverse)
library(limma)
library(edgeR)
library(pheatmap)
library(plotly)
library(VennDiagram)
library(gprofiler2)
library(clusterProfiler)
library(msigdbr)
library(ggvenn)
library(DT)
library(knitr)
library(purrr)
expression_data <- read.table("project_data.txt", header = TRUE, row.names = 1) %>% as.matrix()
metadata <- read.csv("project_metadata.csv")
common_samples <- intersect(colnames(expression_data), metadata$sample)
expr_mat <- expression_data[, common_samples]
meta4 <- metadata %>% filter(sample %in% common_samples) %>% arrange(match(sample, common_samples))
rownames(meta4) <- meta4$sample
meta4$Stage <- factor(meta4$group, levels = c("Control", "Dysplasia", "HkNR", "Cancer"), ordered = TRUE)
log_expr <- log2(expr_mat + 1)
pca <- prcomp(t(log_expr), scale. = TRUE)
pca_df <- as.data.frame(pca$x)
pca_df$Stage <- meta4$Stage
ggplot(pca_df, aes(x = PC1, y = PC2, color = Stage)) +
geom_point(size = 3) +
theme_minimal() +
labs(title = "PCA of Samples")
design4 <- model.matrix(~0 + Stage, data = meta4)
colnames(design4) <- levels(meta4$Stage)
dge <- DGEList(counts = expr_mat)
dge <- calcNormFactors(dge)
v <- voom(dge, design4, plot = FALSE)
fit <- lmFit(v, design4)
cont4 <- makeContrasts(
Dys_vs_Control = Dysplasia - Control,
HkNR_vs_Dys = HkNR - Dysplasia,
Canc_vs_HkNR = Cancer - HkNR,
Canc_vs_Control = Cancer - Control,
levels = design4
)
fit2 <- contrasts.fit(fit, cont4) %>% eBayes()
de_Dys_vs_Control <- topTable(fit2, coef = "Dys_vs_Control", adjust.method = "BH", number = Inf) %>% rownames_to_column("geneID")
de_HkNR_vs_Dys <- topTable(fit2, coef = "HkNR_vs_Dys", adjust.method = "BH", number = Inf) %>% rownames_to_column("geneID")
de_Canc_vs_HkNR <- topTable(fit2, coef = "Canc_vs_HkNR", adjust.method = "BH", number = Inf) %>% rownames_to_column("geneID")
de_Canc_vs_Control <- topTable(fit2, coef = "Canc_vs_Control", adjust.method = "BH", number = Inf) %>% rownames_to_column("geneID")
get_top_deg_table <- function(de_df, contrast_label) {
de_df <- de_df %>%
mutate(Direction = case_when(
adj.P.Val < 0.05 & logFC > 1 ~ "Upregulated",
adj.P.Val < 0.05 & logFC < -1 ~ "Downregulated",
TRUE ~ "Not Significant"
)) %>%
filter(Direction %in% c("Upregulated", "Downregulated")) %>%
arrange(desc(abs(logFC))) %>%
slice(1:20)
cat(paste0("### Top 20 DEGs: ", contrast_label, "\n\n"))
datatable(de_df, options = list(pageLength = 10), rownames = FALSE)
}
get_top_deg_table(de_Dys_vs_Control, "Dysplasia vs Control")
get_top_deg_table(de_HkNR_vs_Dys, "HkNR vs Dysplasia")
get_top_deg_table(de_Canc_vs_HkNR, "Cancer vs HkNR")
get_top_deg_table(de_Canc_vs_Control, "Cancer vs Control")
plot_volcano <- function(df, title) {
df <- df %>% mutate(logP = -log10(adj.P.Val + 1e-300),
Direction = case_when(
adj.P.Val < 0.05 & logFC > 1 ~ "Up",
adj.P.Val < 0.05 & logFC < -1 ~ "Down",
TRUE ~ "NS"))
plot_ly(df, x = ~logFC, y = ~logP, color = ~Direction, colors = c("blue", "grey", "red"),
text = ~paste("Gene:", geneID, "<br>logFC:", round(logFC,2), "<br>FDR:", signif(adj.P.Val, 3)),
hoverinfo = "text", type = 'scatter', mode = 'markers') %>%
layout(title = title, xaxis = list(title = "log2FC"), yaxis = list(title = "-log10(FDR)"))
}
plot_volcano(de_Dys_vs_Control, "Volcano: Dysplasia vs Control")
plot_volcano(de_HkNR_vs_Dys, "Volcano: HkNR vs Dysplasia")
plot_volcano(de_Canc_vs_HkNR, "Volcano: Cancer vs HkNR")
plot_volcano(de_Canc_vs_Control, "Volcano: Cancer vs Control")
genes1 <- de_Dys_vs_Control %>% filter(adj.P.Val < 0.05) %>% pull(geneID)
genes2 <- de_HkNR_vs_Dys %>% filter(adj.P.Val < 0.05) %>% pull(geneID)
genes3 <- de_Canc_vs_HkNR %>% filter(adj.P.Val < 0.05) %>% pull(geneID)
venn_list <- list(
"Dys vs Control" = genes1,
"HkNR vs Dys" = genes2,
"Cancer vs HkNR" = genes3
)
ggvenn(venn_list, fill_color = c("red", "green", "blue"), text_size = 4, set_name_size = 5)
top20 <- de_Dys_vs_Control %>% arrange(adj.P.Val) %>% slice_head(n = 20) %>% pull(geneID)
heat_mat <- log_expr[top20, ]
ord_samps <- meta4 %>% arrange(Stage) %>% pull(sample)
ann <- data.frame(Stage = meta4$Stage)
rownames(ann) <- meta4$sample
pheatmap(heat_mat[, ord_samps], annotation_col = ann[ord_samps,, drop = FALSE],
cluster_rows = TRUE, cluster_cols = FALSE, show_rownames = TRUE,
main = "Top DEGs Across Stages")
sig_genes <- genes3
gostres <- gost(query = sig_genes, organism = "hsapiens", correction_method = "fdr")
gostplot(gostres, interactive = TRUE, capped = TRUE)
top_genes <- unique(c(
de_Dys_vs_Control %>% arrange(adj.P.Val) %>% slice(1:5) %>% pull(geneID),
de_HkNR_vs_Dys %>% arrange(adj.P.Val) %>% slice(1:5) %>% pull(geneID),
de_Canc_vs_HkNR %>% arrange(adj.P.Val) %>% slice(1:5) %>% pull(geneID)
))
expr_long <- log_expr[top_genes, ] %>% as.data.frame() %>%
rownames_to_column("Gene") %>%
pivot_longer(-Gene, names_to = "sample", values_to = "logExpr") %>%
left_join(meta4, by = "sample")
avg_expr <- expr_long %>% group_by(Gene, Stage) %>% summarise(meanExpr = mean(logExpr), .groups = "drop")
ggplot(avg_expr, aes(x = Stage, y = meanExpr, group = 1)) +
geom_line(color = "steelblue", size = 1) +
geom_point(size = 2) +
facet_wrap(~ Gene, scales = "free_y") +
theme_minimal(base_size = 13) +
labs(title = "Gene-wise Expression Trajectories", y = "Mean log2(Expression + 1)", x = "Stage")
library(BiocParallel)
BPPARAM <- SerialParam() # Single line fix for parallel issues
hs_c2 <- msigdbr(species = "Homo sapiens", category = "C2") %>%
select(gs_name, gene_symbol)
rank_and_run_gsea <- function(de_df, label) {
ranked <- deframe(de_df[, c("geneID", "logFC")]) %>% sort(decreasing = TRUE)
GSEA(ranked, TERM2GENE = hs_c2, verbose = FALSE, BPPARAM = BPPARAM)@result %>%
transmute(ID, Description, NES, p.adjust, Stage = label)
}
df_gsea <- bind_rows(
rank_and_run_gsea(de_Dys_vs_Control, "Dysplasia vs Control"),
rank_and_run_gsea(de_HkNR_vs_Dys, "HkNR vs Dysplasia"),
rank_and_run_gsea(de_Canc_vs_HkNR, "Cancer vs HkNR")
)
top_terms <- df_gsea %>%
filter(p.adjust < 0.05) %>%
group_by(ID) %>%
summarise(mean_NES = mean(abs(NES)), .groups = "drop") %>%
slice_max(mean_NES, n = 10) %>%
pull(ID)
ggplot(df_gsea %>% filter(ID %in% top_terms),
aes(x = Stage, y = NES, group = ID, color = ID)) +
geom_line(size = 1) + geom_point(size = 2) +
theme_minimal(base_size = 13) +
labs(title = "GSEA: NES Trajectories Across Stages", x = "Comparison", y = "NES") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))